In [1]:
import pandas as pd
import numpy as np
import os
# Show which dataset folders are available under the input directory.
input_entries = os.listdir("../input")
print(input_entries)

import matplotlib.pyplot as plt
import matplotlib.cm as cm

import seaborn as sb

import plotly.plotly as py
import plotly.graph_objs as go
from plotly.offline import init_notebook_mode,iplot
import plotly.figure_factory as ff
init_notebook_mode(connected=True)

import re

# Any results you write to the current directory are saved as output.
['movies1', 'movies']
In [2]:
# Load the credits table (m1) and the movie metadata table (m2), then show
# the (rows, columns) shape of each.
credits_path = '../input/movies/tmdb_5000_credits.csv'
movies_path = '../input/movies/tmdb_5000_movies.csv'
m1, m2 = (pd.read_csv(csv_path) for csv_path in (credits_path, movies_path))
tuple(frame.shape for frame in (m1, m2))
Out[2]:
((4803, 4), (4803, 20))
In [3]:
# Peek at the first two rows of the credits table.
m1.head(n=2)
Out[3]:
movie_id title cast crew
0 19995 Avatar [{"cast_id": 242, "character": "Jake Sully", "... [{"credit_id": "52fe48009251416c750aca23", "de...
1 285 Pirates of the Caribbean: At World's End [{"cast_id": 4, "character": "Captain Jack Spa... [{"credit_id": "52fe4232c3a36847f800b579", "de...
In [4]:
# Expand the first movie's JSON-encoded crew list into a table and peek at it.
first_crew = pd.read_json(m1['crew'][0])
first_crew.head(2)
Out[4]:
credit_id department gender id job name
0 52fe48009251416c750aca23 Editing 0 1721 Editor Stephen E. Rivkin
1 539c47ecc3a36810e3001f87 Art 2 496 Production Design Rick Carter
In [5]:
# Expand the first movie's JSON-encoded cast list into a table and peek at it.
first_cast = pd.read_json(m1['cast'][0])
first_cast.head(2)
Out[5]:
cast_id character credit_id gender id name order
0 242 Jake Sully 5602a8a7c3a3685532001c9a 2 65731 Sam Worthington 0
1 3 Neytiri 52fe48009251416c750ac9cb 1 8691 Zoe Saldana 1
In [6]:
def extractnum(x):
    """Count the records in a JSON-encoded list.

    Parameters
    ----------
    x : str
        JSON text holding a list of records (e.g. one cell of the ``cast``
        or ``crew`` column).

    Returns
    -------
    int
        Number of records in the list; 0 when ``x`` is missing, is not valid
        JSON, or does not decode to a list.
    """
    # Local import keeps this cell self-contained; the notebook's import cell
    # does not bring in json.
    import json

    # Parse with the json module rather than pd.read_json: newer pandas
    # deprecates literal JSON strings there, and a blanket ``except`` would
    # turn that failure into a silent 0 for every row.
    try:
        records = json.loads(x)
    except (TypeError, ValueError):
        # TypeError: missing value (None / NaN); ValueError: malformed JSON.
        return 0
    return len(records) if isinstance(records, list) else 0

def extractdirector(x):
    """Return the director's name from a JSON-encoded crew list.

    Each crew record carries both a ``department`` and a ``job`` field.
    Matching on the department alone does not guarantee that the person's
    job is actually "Director", so a record in the "Directing" department
    whose job is "Director" is preferred. If there is none, the first member
    of the "Directing" department is returned, as before.

    Parameters
    ----------
    x : str
        JSON text holding a list of crew records.

    Returns
    -------
    str
        The selected crew member's name, or ``'Not found'`` when ``x`` is
        missing, malformed, or has nobody in the "Directing" department.
    """
    # Local import keeps this cell self-contained.
    import json

    try:
        crew = json.loads(x)
        directing = [member for member in crew
                     if member['department'] == 'Directing']
        directors = [member for member in directing
                     if member.get('job') == 'Director']
        # ``directors`` wins when non-empty; otherwise use the department match.
        return (directors or directing)[0]['name']
    except (TypeError, ValueError, KeyError, IndexError, AttributeError):
        # Missing value, bad JSON, unexpected record shape, or no match.
        return 'Not found'
    
def extractlead(x):
    """Return the name of the first entry in a JSON-encoded cast list.

    Parameters
    ----------
    x : str
        JSON text holding a list of cast records, each with a ``name`` key.

    Returns
    -------
    str
        ``name`` of the first record, or ``'Not found'`` when ``x`` is
        missing, malformed, empty, or the record has no ``name``.
    """
    # Local import keeps this cell self-contained.
    import json

    # json.loads avoids pd.read_json on a literal string (deprecated in newer
    # pandas) and lets us catch only the failures we expect.
    try:
        return json.loads(x)[0]['name']
    except (TypeError, ValueError, KeyError, IndexError):
        return 'Not found'
In [7]:
# Derive per-movie summary columns from the JSON-encoded crew and cast fields.
crew_json, cast_json = m1['crew'], m1['cast']
m1['Director'] = crew_json.apply(extractdirector)
m1['total_crew'] = crew_json.apply(extractnum)
m1['total_cast'] = cast_json.apply(extractnum)
m1['lead_actor'] = cast_json.apply(extractlead)
In [8]:
# Check the newly derived columns on the first two rows.
m1.head(n=2)
Out[8]:
movie_id title cast crew Director total_crew total_cast lead_actor
0 19995 Avatar [{"cast_id": 242, "character": "Jake Sully", "... [{"credit_id": "52fe48009251416c750aca23", "de... James Cameron 153 83 Sam Worthington
1 285 Pirates of the Caribbean: At World's End [{"cast_id": 4, "character": "Captain Jack Spa... [{"credit_id": "52fe4232c3a36847f800b579", "de... Gore Verbinski 32 34 Johnny Depp
In [9]:
# The raw JSON columns are no longer needed once their summaries exist.
m11 = m1.drop(columns=['cast', 'crew'])
m11.head(n=2)
Out[9]:
movie_id title Director total_crew total_cast lead_actor
0 19995 Avatar James Cameron 153 83 Sam Worthington
1 285 Pirates of the Caribbean: At World's End Gore Verbinski 32 34 Johnny Depp
In [10]:
# CLEANING M2
In [11]:
# Peek at the first two rows of the movie metadata table.
m2.head(n=2)
Out[11]:
budget genres homepage id keywords original_language original_title overview popularity production_companies production_countries release_date revenue runtime spoken_languages status tagline title vote_average vote_count
0 237000000 [{"id": 28, "name": "Action"}, {"id": 12, "nam... http://www.avatarmovie.com/ 19995 [{"id": 1463, "name": "culture clash"}, {"id":... en Avatar In the 22nd century, a paraplegic Marine is di... 150.437577 [{"name": "Ingenious Film Partners", "id": 289... [{"iso_3166_1": "US", "name": "United States o... 2009-12-10 2787965087 162.0 [{"iso_639_1": "en", "name": "English"}, {"iso... Released Enter the World of Pandora. Avatar 7.2 11800
1 300000000 [{"id": 12, "name": "Adventure"}, {"id": 14, "... http://disney.go.com/disneypictures/pirates/ 285 [{"id": 270, "name": "ocean"}, {"id": 726, "na... en Pirates of the Caribbean: At World's End Captain Barbossa, long believed to be dead, ha... 139.082615 [{"name": "Walt Disney Pictures", "id": 2}, {"... [{"iso_3166_1": "US", "name": "United States o... 2007-05-19 961000000 169.0 [{"iso_639_1": "en", "name": "English"}] Released At the end of the world, the adventure begins. Pirates of the Caribbean: At World's End 6.9 4500
In [12]:
# GENRE EXTRACTION
# Expand the first movie's JSON-encoded genre list into a small table.
first_genres_json = m2['genres'][0]
d = pd.read_json(first_genres_json)
d
Out[12]:
id name
0 28 Action
1 12 Adventure
2 14 Fantasy
3 878 Science Fiction
In [13]:
from functools import reduce

# Fold the first movie's keyword names into one space-separated string.
df = pd.read_json(m2['keywords'][0])
keyword_names = df['name'].tolist()
reduce(lambda joined, word: ' '.join((joined, word)), keyword_names)
Out[13]:
'culture clash future space war space colony society space travel futuristic romance space alien tribe alien planet cgi marine soldier battle love affair anti war power relations mind and soul 3d'
In [14]:
def extractgenre(x):
    """Join the ``name`` of every record in a JSON-encoded list with spaces.

    Parameters
    ----------
    x : str
        JSON text holding a list of records that each have a ``name`` key
        (e.g. one cell of the ``genres`` column).

    Returns
    -------
    str
        The names separated by single spaces, or the string ``'None'`` when
        ``x`` is missing, malformed, or contains no records.
    """
    # Local import keeps this cell self-contained.
    import json

    try:
        names = [record['name'] for record in json.loads(x)]
        # An empty list keeps the historical 'None' sentinel instead of ''.
        return ' '.join(names) if names else 'None'
    except (TypeError, ValueError, KeyError):
        # Missing value, bad JSON, record without 'name', or non-string names.
        return 'None'
In [15]:
# One-column frame holding each movie's space-separated genre names.
genre_text = m2['genres'].apply(extractgenre)
gen = genre_text.to_frame(name='genre')
In [16]:
### COUNT VECTORIZER TO CREATE A TFM OF GENRES

from sklearn.feature_extraction.text import CountVectorizer
In [17]:
# Build a term-count matrix from the genre strings.
# NOTE: the vectorizer tokenizes on words, so multi-word genres are split into
# separate columns (e.g. "science" and "fiction", "tv" and "movie").
cv = CountVectorizer(stop_words='english')
genbw = cv.fit_transform(gen['genre'])

# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when it exists and fall back to the old name on older installations.
if hasattr(cv, 'get_feature_names_out'):
    genre_terms = cv.get_feature_names_out()
else:
    genre_terms = cv.get_feature_names()

genre = pd.DataFrame(genbw.toarray(), columns=genre_terms)
genre.head(2)
Out[17]:
action adventure animation comedy crime documentary drama family fantasy fiction foreign history horror movie music mystery romance science thriller tv war western
0 1 1 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 1 0 0 0 0
1 1 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0
In [18]:
# Expand the first movie's JSON-encoded production companies and peek at them.
first_companies = pd.read_json(m2['production_companies'][0])
first_companies.head(2)
Out[18]:
id name
0 289 Ingenious Film Partners
1 306 Twentieth Century Fox Film Corporation
In [19]:
# Expand the first movie's JSON-encoded production countries and peek at them.
first_countries = pd.read_json(m2['production_countries'][0])
first_countries.head(2)
Out[19]:
iso_3166_1 name
0 US United States of America
1 GB United Kingdom
In [20]:
def extractpc(x):
    """Return the ``name`` of the first record in a JSON-encoded list.

    Used for the production company and production country columns, where
    only the first listed entry is kept.

    Parameters
    ----------
    x : str
        JSON text holding a list of records that each have a ``name`` key.

    Returns
    -------
    str
        ``name`` of the first record, or the string ``'None'`` when ``x`` is
        missing, malformed, empty, or the record has no ``name``.
    """
    # Local import keeps this cell self-contained.
    import json

    # json.loads avoids pd.read_json on a literal string (deprecated in newer
    # pandas) and lets us catch only the failures we expect.
    try:
        return json.loads(x)[0]['name']
    except (TypeError, ValueError, KeyError, IndexError):
        return 'None'
In [21]:
# Keep only the first listed production company / country for each movie.
for target_col, source_col in (('pro_company', 'production_companies'),
                               ('pro_country', 'production_countries')):
    m2[target_col] = m2[source_col].apply(extractpc)
In [22]:
### DROPPING ALL THE UNNECESSARY COLUMNS

unneeded_columns = [
    'genres', 'homepage', 'keywords', 'production_countries', 'overview',
    'production_companies', 'spoken_languages', 'tagline', 'title',
]
m21 = m2.drop(columns=unneeded_columns)
m21.head(n=2)
Out[22]:
budget id original_language original_title popularity release_date revenue runtime status vote_average vote_count pro_company pro_country
0 237000000 19995 en Avatar 150.437577 2009-12-10 2787965087 162.0 Released 7.2 11800 Ingenious Film Partners United States of America
1 300000000 285 en Pirates of the Caribbean: At World's End 139.082615 2007-05-19 961000000 169.0 Released 6.9 4500 Walt Disney Pictures United States of America
In [23]:
## MERGING THE DATAFRAMES
In [24]:
# Join the credit summaries to the movie metadata on the movie identifier.
movies = m11.merge(m21, how='inner', left_on='movie_id', right_on='id')
movies.head(n=2)
Out[24]:
movie_id title Director total_crew total_cast lead_actor budget id original_language original_title popularity release_date revenue runtime status vote_average vote_count pro_company pro_country
0 19995 Avatar James Cameron 153 83 Sam Worthington 237000000 19995 en Avatar 150.437577 2009-12-10 2787965087 162.0 Released 7.2 11800 Ingenious Film Partners United States of America
1 285 Pirates of the Caribbean: At World's End Gore Verbinski 32 34 Johnny Depp 300000000 285 en Pirates of the Caribbean: At World's End 139.082615 2007-05-19 961000000 169.0 Released 6.9 4500 Walt Disney Pictures United States of America
In [25]:
## REMOVING THE ID COLUMN AS IT IS REDUNDANT (it duplicates movie_id after the
## merge) AND CONVERTING RELEASE DATE TO A DATE
movies['release_date'] = pd.to_datetime(movies['release_date']).dt.date
# Rebind instead of dropping in place, and ignore a missing column, so that
# re-running this cell does not fail with a KeyError once 'id' is gone.
movies = movies.drop(['id'], axis=1, errors='ignore')
In [26]:
# How many movies fall under each release status?
status_counts = movies['status'].value_counts()
status_counts
Out[26]:
Released           4795
Rumored               5
Post Production       3
Name: status, dtype: int64
In [27]:
# Keep only released movies that report both a positive budget and a positive
# revenue. The explicit .copy() makes movies1 an independent frame, so adding
# columns to it later does not trigger pandas' SettingWithCopyWarning.
is_released = movies['status'] == 'Released'
has_financials = (movies['budget'] > 0) & (movies['revenue'] > 0)
movies1 = movies[is_released & has_financials].copy()
movies1.head(2)
Out[27]:
movie_id title Director total_crew total_cast lead_actor budget original_language original_title popularity release_date revenue runtime status vote_average vote_count pro_company pro_country
0 19995 Avatar James Cameron 153 83 Sam Worthington 237000000 en Avatar 150.437577 2009-12-10 2787965087 162.0 Released 7.2 11800 Ingenious Film Partners United States of America
1 285 Pirates of the Caribbean: At World's End Gore Verbinski 32 34 Johnny Depp 300000000 en Pirates of the Caribbean: At World's End 139.082615 2007-05-19 961000000 169.0 Released 6.9 4500 Walt Disney Pictures United States of America
In [28]:
### SIMPLE FUNCTIONS TO IMPLEMENT PLOTLY GRAPHS

def boxtrace(df=None, col_name=None, boxpoints='outliers', boxmean=True):
    """Build a plotly box trace for one column of ``df``.

    The column's values go on the y axis and the column name is used as the
    trace name; ``boxpoints`` and ``boxmean`` are forwarded to ``go.Box``.
    """
    box_options = dict(
        y=df[col_name],
        name=col_name,
        boxpoints=boxpoints,
        boxmean=boxmean,
    )
    return go.Box(**box_options)

def pietrace(df, col_name=None):
    """Build a plotly pie trace of how often each value of ``col_name`` occurs."""
    counts = df[col_name].value_counts()
    slice_labels = list(counts.index)
    slice_sizes = list(counts.values)
    return go.Pie(labels=slice_labels, values=slice_sizes)

def violintrace(df=None, x_col=None, y_col=None, name=None):
    """Build a plotly violin trace of ``df[y_col]`` with box and mean line shown.

    When ``x_col`` is given, its values are passed as the x (grouping) axis;
    otherwise a single violin is produced.
    """
    violin_options = {
        "y": df[y_col],
        "box": {"visible": True},
        "meanline": {"visible": True},
        "name": name,
    }
    if x_col:
        violin_options["x"] = df[x_col]
    return go.Violin(**violin_options)

def distplot(df=None, col_names=None, show_hist=False):
    """Build a plotly distribution plot for several columns of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data.
    col_names : sequence of str, optional
        Columns to plot; also used as the group labels. Defaults to no
        columns. (``None`` replaces the former mutable ``[]`` default.)
    show_hist : bool
        Forwarded to ``ff.create_distplot``.

    Returns
    -------
    The figure returned by ``ff.create_distplot``.
    """
    labels = [] if col_names is None else list(col_names)
    # Missing values are replaced by -1 before plotting, so they show up as a
    # bump at -1 in the resulting curve.
    data = [df[label].fillna(-1) for label in labels]
    return ff.create_distplot(data, labels, show_hist=show_hist)

def bartrace(df=None, x_col=None, y_col=None, name=None):
    """Build a plotly bar trace with ``df[x_col]`` as categories and ``df[y_col]`` as heights."""
    heights = df[y_col]
    categories = df[x_col]
    return go.Bar(y=heights, x=categories, name=name)

def scattertrace(df=None, x_col=None, y_col=None, hover_col=None, color_col=None):
    """Build a plotly marker scatter of ``df[y_col]`` against ``df[x_col]``.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data.
    x_col, y_col : str
        Columns plotted on the x and y axes.
    hover_col : str, optional
        Column whose values are shown as hover text; no hover text when
        omitted.
    color_col : str, optional
        Column that drives the marker colour and the colour bar. Defaults to
        ``y_col``.

    Returns
    -------
    plotly.graph_objs.Scatter
    """
    # Colour by a real column, one value per marker. The previous fixed array
    # of 500 random numbers was unrelated to the data and to the number of
    # points, so the displayed colour scale carried no meaning.
    marker_values = df[color_col] if color_col is not None else df[y_col]
    hover_values = df[hover_col] if hover_col is not None else None
    return go.Scatter(
        y=df[y_col],
        x=df[x_col],
        hovertext=hover_values,
        mode='markers',
        marker=dict(
            size=16,
            color=marker_values,
            colorscale='Viridis',
            showscale=True,
        ),
    )

**COUNTRY-WISE ANALYSIS**

In [29]:
## COMPOSITION OF MOVIES PRODUCED BY COUNTRY

# Ten most frequent production countries. Naming the column explicitly keeps
# this working whether value_counts() labels its result 'pro_country' (older
# pandas) or 'count' (pandas 2.x).
country_counts = movies['pro_country'].value_counts().head(10)
df = country_counts.to_frame(name='pro_country')

# 'None' is the placeholder used for movies without a listed country; remove
# it if it made the top ten, and do not fail if it did not.
df = df.drop(['None'], axis=0, errors='ignore')

iplot([go.Pie(labels=list(df.index), values=list(df['pro_country']))])

**COUNTRY-WISE TOTAL BUDGETS AND REVENUES GENERATED**

In [30]:
# Total budget and revenue per production country; keep the five countries
# with the highest total revenue.
country_totals = movies1.groupby('pro_country')[['budget', 'revenue']].sum()
df = country_totals.sort_values(by='revenue', ascending=False).head(5)
df
Out[30]:
budget revenue
pro_country
United States of America 91104705990 280520086338
United Kingdom 10693231151 36541235127
Germany 7330496682 16669142023
Canada 4657093926 11993707853
France 3369200000 8105107529
In [31]:
# Side-by-side bars of total budget and total revenue for each country in df.
bar_specs = (('budget', 'BUDGET'), ('revenue', 'REVENUE'))
data = [go.Bar(x=df.index, y=df[column], name=label) for column, label in bar_specs]

layout = go.Layout(barmode='group')

grouped_figure = go.Figure(data=data, layout=layout)
iplot(grouped_figure, filename='grouped-bar')

**TOP PRODUCTION COMPANIES**

In [32]:
# Ten most frequent production companies. Naming the column explicitly keeps
# this working whether value_counts() labels its result 'pro_company' (older
# pandas) or 'count' (pandas 2.x).
company_counts = movies['pro_company'].value_counts().head(10)
df = company_counts.to_frame(name='pro_company')

# 'None' is the placeholder used for movies without a listed company; remove
# it if it made the top ten, and do not fail if it did not.
df = df.drop(['None'], axis=0, errors='ignore')

iplot([go.Pie(labels=list(df.index), values=list(df['pro_company']))])

**BUDGETS AND REVENUES**

In [33]:
# Box plots of revenue and budget, drawn on a shared axis.
iplot([boxtrace(df=movies1, col_name=column) for column in ("revenue", "budget")])

BETTER INSIGHTS THROUGH THE VIOLIN PLOTS

In [34]:
# Violin plots of budget and revenue, drawn on a shared axis.
iplot([violintrace(df=movies1, y_col=column) for column in ("budget", "revenue")])

THE CORRELATION BETWEEN BUDGET AND REVENUE

In [35]:
# Revenue against budget, one marker per movie, with the title on hover.
budget_vs_revenue = scattertrace(df=movies1, x_col='budget', y_col='revenue', hover_col='title')
iplot([budget_vs_revenue])
In [36]:
## PRODUCTION COMPANY WISE BUDGETS COMPARISON

# One budget violin per studio, each built from that studio's rows only.
studios = [
    "Paramount Pictures",
    "Universal Pictures",
    "Columbia Pictures",
    "Twentieth Century Fox Film Corporation",
]
data = [
    violintrace(df=movies1[movies1["pro_company"] == studio], y_col="budget", name=studio)
    for studio in studios
]
layout = dict(
    title="BUDGETS COMAPARISION",
    yaxis=dict(zeroline=False),
    violinmode="group",
)
fig = go.Figure(data=data, layout=layout)
iplot(fig)

PRODUCTION COMPANY-WISE REVENUES GENERATED

In [37]:
# One revenue violin per studio, each built from that studio's rows only.
studios = [
    "Paramount Pictures",
    "Universal Pictures",
    "Columbia Pictures",
    "Twentieth Century Fox Film Corporation",
]
data = [
    violintrace(df=movies1[movies1["pro_company"] == studio], y_col="revenue", name=studio)
    for studio in studios
]
layout = dict(
    title="REVENUES COMAPARISION",
    yaxis=dict(zeroline=False),
    violinmode="group",
)
fig = go.Figure(data=data, layout=layout)
iplot(fig)
In [38]:
# Reminder of the columns available in the merged table.
movies.head(n=1)
Out[38]:
movie_id title Director total_crew total_cast lead_actor budget original_language original_title popularity release_date revenue runtime status vote_average vote_count pro_company pro_country
0 19995 Avatar James Cameron 153 83 Sam Worthington 237000000 en Avatar 150.437577 2009-12-10 2787965087 162.0 Released 7.2 11800 Ingenious Film Partners United States of America

SURFACE PLOT OF REVENUE WITH POPULARITY AND BUDGET

In [39]:
# Revenue laid out on a popularity (rows) x budget (columns) grid; cells with
# no movie are filled with 0.
# pivot_table is used instead of pivot so that two movies sharing the same
# (popularity, budget) pair are averaged rather than raising a ValueError.
revenue_grid = movies1.pivot_table(
    index='popularity', columns='budget', values='revenue', aggfunc='mean'
).fillna(0)
pp = revenue_grid.values.tolist()

# Pass the real budget / popularity values as x / y so the surface axes show
# them instead of positional indices.
iplot([go.Surface(
    x=revenue_grid.columns.tolist(),
    y=revenue_grid.index.tolist(),
    z=pp,
)])

**MOVIE-WISE**

In [40]:
## MOST POPULAR MOVIES

# Ten rows with the highest popularity score, most popular first.
by_popularity = movies.sort_values(by='popularity', ascending=False)
df = by_popularity.iloc[:10]
popularity_bars = go.Bar(x=df['title'], y=df['popularity'], name='TOP 10 POPULAR MOVIES')
iplot([popularity_bars])

TOP MOVIES

In [41]:
# Ignore very small budgets when ranking by return, since dividing by a tiny
# budget produces extreme ratios.
MIN_BUDGET = 1000000
movies2 = movies1[movies1['budget'] > MIN_BUDGET]

# Return on budget: (revenue - budget) / budget. The ratio is computed on
# movies2 only, so index alignment leaves NaN for movies at or below
# MIN_BUDGET. assign() returns a new frame instead of writing into a filtered
# slice, which avoids pandas' SettingWithCopyWarning.
movies1 = movies1.assign(
    profit_mve=(movies2['revenue'] - movies2['budget']) / movies2['budget']
)

# sort_values places NaN last, so excluded movies never reach the top ten.
df = movies1.sort_values('profit_mve', ascending=False).head(10)[['title', 'profit_mve']]


iplot([go.Bar(y=df['profit_mve'], x=df['title'], name='TOP 10 PROFIT GENERATED MOVIES')])

THE CORRELATION BETWEEN VOTE COUNT AND AVERAGE RATING

In [42]:
# Average rating against number of votes, one marker per movie.
votes_vs_rating = scattertrace(df=movies1, x_col='vote_count', y_col='vote_average', hover_col='title')
iplot([votes_vs_rating])

YEAR-WISE

In [43]:
# Revenue and budget of every movie, ordered by release date.
df = movies1.sort_values(by='release_date')
release_dates = df['release_date']

revenue_line = go.Scatter(x=release_dates, y=df['revenue'], mode='lines+markers', name='REVENUE')
budget_line = go.Scatter(x=release_dates, y=df['budget'], mode='lines', name='BUDGET')
data = [revenue_line, budget_line]

iplot(data, filename='line-mode')

The cast and crew correlation

In [44]:
# Cast size against crew size, one marker per movie.
crew_vs_cast = scattertrace(df=movies1, x_col='total_crew', y_col='total_cast', hover_col='title')
iplot([crew_vs_cast])

ACTORS

USING THE COUNT VECTORISER

In [45]:
# Table with one row per movie: first-billed actor, rating, year, title and
# one 0/1 indicator column per genre.
actor_genre_path = '../input/movies/movies_data.csv'
m3 = pd.read_csv(actor_genre_path)
m3.head(n=2)
Out[45]:
actor_1_name vote_average title_year movie_title Mystery Foreign Crime Western Drama Family Horror Documentary Animation Fantasy Music History Comedy Romance War Thriller Science Fiction Action TV Movie Adventure
0 Zoe Saldana 7.2 2009.0 Avatar 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 1 0 1
1 Orlando Bloom 6.9 2007.0 Pirates of the Caribbean: At World's End 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 1
In [46]:
# Five actors who appear most often as the first-billed name.
m3['actor_1_name'].value_counts().head(5)
Out[46]:
Jennifer Aniston      15
Morgan Freeman        13
Samuel L. Jackson     12
Brad Pitt             12
Scarlett Johansson    11
Name: actor_1_name, dtype: int64
In [47]:
# Column totals of the genre indicator columns (everything from the fifth
# column on); keep the names of the six most common genres.
genre_totals = m3.iloc[:, 4:].sum(axis=0)
genre_totals.sort_values(ascending=False).head(6).index
Out[47]:
Index(['Drama', 'Comedy', 'Thriller', 'Action', 'Romance', 'Adventure'], dtype='object')
In [48]:
# Number of movies per actor in each of the six most common genres.
top_genres = ['Drama', 'Comedy', 'Thriller', 'Action', 'Romance', 'Adventure']
df = m3.groupby('actor_1_name')[top_genres].sum().reset_index()
df.head(n=2)
Out[48]:
actor_1_name Drama Comedy Thriller Action Romance Adventure
0 A.J. Buckley 0 1 0 0 0 0
1 A.J. Cook 0 0 0 0 0 0

**MY FAVORITE POKEMON-STYLE CHART (RADAR CHART)**

In [49]:
# Radar chart comparing two actors' movie counts across six genres.
genre_axes = ['Drama', 'Comedy', 'Thriller', 'Action', 'Romance', 'Adventure']

x = df[df["actor_1_name"] == "Jennifer Aniston"]
y = df[df["actor_1_name"] == "Brad Pitt"]

# One filled polar trace per actor; each selection above holds a single row,
# so .values[0] reads that actor's count (or name).
data = [
    go.Scatterpolar(
        r=[actor_row[genre_name].values[0] for genre_name in genre_axes],
        theta=list(genre_axes),
        fill='toself',
        name=actor_row["actor_1_name"].values[0],
    )
    for actor_row in (x, y)
]

radial_axis = dict(visible=True)
layout = go.Layout(polar=dict(radialaxis=radial_axis), showlegend=True)
fig = go.Figure(data=data, layout=layout)
iplot(fig, filename="ACTOR CAREER STATS")